#Partisan Stability During Turbulent Times: Voter Study Group (VSG) data analysis
#Donald P. Green and Paul B. Platzman
#July 2021

#NOTE TO USER: always run all code from the start of the document until line 175, which reads: "####Creating data set [SUBJECTS WITH ANY NUMBER OF MISSING VALUES]." This reads in and processes the data in preparation for analysis. After line 175, any code chunks separated by four hashtags can be run on their own.

#Data for this script can be found at the following URL: https://www.voterstudygroup.org/data/voter-survey. You will need to provide an email address in the prompt, and the link to the data will be sent to your inbox. After the data is downloaded, it can be imported in the following manner:

####Loading data
vsg = read.csv("./VOTER_Survey_Jan217_Release1-csv.csv")



####Defining functions
PID3 = function(x){
  if(is.na(x)){
    return(NA)
  }
  else if(x==1){
    return(as.numeric(-1))
  }
  else if(x==2){
    return(as.numeric(1))
  }
  else if(x==3){
    return(as.numeric(0))
  }
  else{
    return(NA)
  }
}

PID7 = function(x){
  if(is.na(x)){
    return(NA)
  }
  else if(x==1){
    return(as.numeric(-3))
  }
  else if(x==2){
    return(as.numeric(-2))
  }
  else if(x==3){
    return(as.numeric(-1))
  }
  else if(x==4){
    return(as.numeric(0))
  }
  else if(x==5){
    return(as.numeric(1))
  }
  else if(x==6){
    return(as.numeric(2))
  }
  else if(x==7){
    return(as.numeric(3))
  }
  else{
    return(NA)
  }
}

convert_to_numeric_PID3 = function(df){
  for(i in 1:nrow(df)){
    for(j in 1:ncol(df)){
      df[i,j] = as.numeric(PID3(df[i,j]))
    }
  }
  return(df)}

convert_to_numeric_PID7 = function(df){
  for(i in 1:nrow(df)){
    for(j in 1:ncol(df)){
      df[i,j] = as.numeric(PID7(df[i,j]))
    }
  }
  return(df)}

convert_to_numeric_2 = function(df){
  for (i in 1:ncol(df)){
    df[,i] = sapply(df[,i], as.numeric)
  }
  return(df)
}

income = function(x){
  if(is.na(x)){
    return(NA)
  }
  else if(x==1){
    return(as.numeric(5000))
  }
  else if(x==2){
    return(as.numeric(15000))
  }
  else if(x==3){
    return(as.numeric(25000))
  }
  else if(x==4){
    return(as.numeric(35000))
  }
  else if(x==5){
    return(as.numeric(45000))
  }
  else if(x==6){
    return(as.numeric(55000))
  }
  else if(x==7){
    return(as.numeric(65000))
  }
  else if(x==8){
    return(as.numeric(75000))
  }
  else if(x==9){
    return(as.numeric(90000))
  }
  else if(x==10){
    return(as.numeric(110000))
  }
  else if(x==11){
    return(as.numeric(135000))
  }
  else if(x==31){
    return(as.numeric(175000))
  }
  else{
    return(NA)
  }
}

missingness_as_category = function(df){
  for(i in 1:nrow(df)){
    for(j in 1:ncol(df)){
      if (is.na(df[i,j])){
        df[i,j] = 1
      }
      else {
        df[i,j] = 0
      }
    }
  }
  return(df)
}

wiley_r_squared = function(df,first_wave){
  #Defining relative waves
  y0 = df[,first_wave-1]
  y1 = df[,first_wave]
  y2 = df[,first_wave+1]
  y3 = df[,first_wave+2]
  
  #Computing components of R^2
  cov_y1_y2_squared = cov(y1,y2)^2
  var_y1 = var(y1)
  var_y2 = var(y2)
  var_epsilon1 = var_y1 - (cov(y1,y2)*(cov(y0,y1)/cov(y0,y2)))
  var_epsilon2 = var_y2 - (cov(y2,y3)*(cov(y1,y2)/cov(y1,y3)))
  
  #Computing implied R^2
  r_squared_12 = cov_y1_y2_squared/((var_y1-var_epsilon1)*(var_y2-var_epsilon2))
  
  return(r_squared_12)
}



####Renaming columns
names(vsg)[names(vsg) == 'post_pid7_2012'] <- 'pid7_2012'
names(vsg)[names(vsg) == 'post_pid3_2012'] <- 'pid3_2012'
names(vsg)[names(vsg) == 'pid7_baseline'] <- 'pid7_2011'
names(vsg)[names(vsg) == 'pid3_baseline'] <- 'pid3_2011'



####Creating data set [SUBJECTS WITH ANY NUMBER OF MISSING VALUES]
##Assign a minimum number of missing values each row must have in order to remain in the data set. 0 is equivalent to all cases. 6 is the maximum value applicable.
missing_value_minimum = 0

##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Filtering by rows
#Removing if too few NAs or too many NAs
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))>=missing_value_minimum,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))>=missing_value_minimum,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))<=5,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))<=5,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##Computing statistics
#Correlations
cor_PID7 = round(cor(all_waves_PID7, use="pairwise.complete.obs"),3)
cor_PID3 = round(cor(all_waves_PID3, use="pairwise.complete.obs"),3)

lower.tri(cor_PID7, diag = FALSE)
upper.tri(cor_PID7, diag = FALSE)
lower_7<-cor_PID7
lower_7[lower.tri(cor_PID7, diag=TRUE)]=""
lower_7<-as.data.frame(lower_7)
lower_7
write.csv(lower_7, "./one_plus_missing_cases_PID7_correlations.csv")

lower.tri(cor_PID3, diag = FALSE)
upper.tri(cor_PID3, diag = FALSE)
lower_3<-cor_PID3
lower_3[lower.tri(cor_PID3, diag=TRUE)]=""
lower_3<-as.data.frame(lower_3)
lower_3
write.csv(lower_3, "./one_plus_missing_cases_PID3_correlations.csv")

#Counting the number of observations in each cell
library("psych")
cell_counts_PID7 = pairwiseCount(all_waves_PID7, diagonal=FALSE) #Highest cell count is 2012-16 with 7917. Lowest cell count is 2011-2018 with 4558.
cell_counts_PID3 = pairwiseCount(all_waves_PID3, diagonal=FALSE) #Highest cell count is 2012-16 with 7606. Lowest cell count is 2011-2018 with 4354.

write.csv(cell_counts_PID7, "./cell_counts_PID7_correlations.csv")
write.csv(cell_counts_PID3, "./cell_counts_PID3_correlations.csv")

#Means and SDs (Individuals)
all_waves_PID7$Mean_PID7 = apply(all_waves_PID7[,1:6], 1, mean, na.rm = TRUE)
all_waves_PID7$SD_PID7 = apply(all_waves_PID7[,1:6], 1, sd, na.rm = TRUE)

summary(all_waves_PID7$Mean_PID7)
summary(all_waves_PID7$SD_PID7)

all_waves_PID3$Mean_PID3 = apply(all_waves_PID3[,1:6], 1, mean, na.rm = TRUE)
all_waves_PID3$SD_PID3 = apply(all_waves_PID3[,1:6], 1, sd, na.rm = TRUE)

summary(all_waves_PID3$Mean_PID3)
summary(all_waves_PID3$SD_PID3)

#Means and SDs (Waves)
as.data.frame(round(apply(all_waves_PID7, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID7, 2, sd, na.rm=TRUE),2))

as.data.frame(round(apply(all_waves_PID3, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID3, 2, sd, na.rm=TRUE),2))



####Counting number of complete cases by successive waves
#wave1_PID7 = as.data.frame(vsg[, 'pid7_baseline'])
#wave1_PID3 = as.data.frame(vsg[, 'pid3_baseline'])
#wave1_PID7 = convert_to_numeric_PID7(wave1_PID7)
#wave1_PID3 = convert_to_numeric_PID3(wave1_PID3)
#sum(is.na(wave1_PID7)) #Subtract this number from 9,548
#sum(is.na(wave1_PID3)) #Subtract this number from 9,548

#wave12_PID7 = vsg[, c('post_pid7_2012', 'pid7_baseline')]
#wave12_PID3 = vsg[, c('post_pid3_2012', 'pid3_baseline')]
#wave12_PID7 = convert_to_numeric_PID7(wave12_PID7)
#wave12_PID3 = convert_to_numeric_PID3(wave12_PID3)
#wave12_PID7 = wave12_PID7[rowSums(is.na(wave12_PID7))==0,]
#wave12_PID3 = wave12_PID3[rowSums(is.na(wave12_PID3))==0,]

#wave13_PID7 = vsg[, c('pid7_2016','post_pid7_2012', 'pid7_baseline')]
#wave13_PID3 = vsg[, c('pid3_2016','post_pid3_2012', 'pid3_baseline')]
#wave13_PID7 = convert_to_numeric_PID7(wave13_PID7)
#wave13_PID3 = convert_to_numeric_PID3(wave13_PID3)
#wave13_PID7 = wave13_PID7[rowSums(is.na(wave13_PID7))==0,]
#wave13_PID3 = wave13_PID3[rowSums(is.na(wave13_PID3))==0,]

#wave14_PID7 = vsg[, c('pid7_2017','pid7_2016','post_pid7_2012', 'pid7_baseline')]
#wave14_PID3 = vsg[, c('pid3_2017','pid3_2016','post_pid3_2012', 'pid3_baseline')]
#wave14_PID7 = convert_to_numeric_PID7(wave14_PID7)
#wave14_PID3 = convert_to_numeric_PID3(wave14_PID3)
#wave14_PID7 = wave14_PID7[rowSums(is.na(wave14_PID7))==0,]
#wave14_PID3 = wave14_PID3[rowSums(is.na(wave14_PID3))==0,]

#wave15_PID7 = vsg[, c('pid7_2018','pid7_2017','pid7_2016','post_pid7_2012', 'pid7_baseline')]
#wave15_PID3 = vsg[, c('pid3_2018','pid3_2017','pid3_2016','post_pid3_2012', 'pid3_baseline')]
#wave15_PID7 = convert_to_numeric_PID7(wave15_PID7)
#wave15_PID3 = convert_to_numeric_PID3(wave15_PID3)
#wave15_PID7 = wave15_PID7[rowSums(is.na(wave15_PID7))==0,]
#wave15_PID3 = wave15_PID3[rowSums(is.na(wave15_PID3))==0,]

#wave16_PID7 = vsg[, c('pid7_2019','pid7_2018','pid7_2017','pid7_2016','post_pid7_2012', 'pid7_baseline')]
#wave16_PID3 = vsg[, c('pid3_2019','pid3_2018','pid3_2017','pid3_2016','post_pid3_2012', 'pid3_baseline')]
#wave16_PID7 = convert_to_numeric_PID7(wave16_PID7)
#wave16_PID3 = convert_to_numeric_PID3(wave16_PID3)
#wave16_PID7 = wave16_PID7[rowSums(is.na(wave16_PID7))==0,]
#wave16_PID3 = wave16_PID3[rowSums(is.na(wave16_PID3))==0,]



####Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Filtering by rows
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))==0,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))==0,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##Computing statistics
#Correlations
cor_PID7_listwise = round(cor(all_waves_PID7),3)
cor_PID3_listwise = round(cor(all_waves_PID3),3)

lower.tri(cor_PID7_listwise, diag = FALSE)
upper.tri(cor_PID7_listwise, diag = FALSE)
lower_7_listwise<-cor_PID7_listwise
lower_7_listwise[lower.tri(cor_PID7_listwise, diag=TRUE)]=""
lower_7_listwise<-as.data.frame(lower_7_listwise)
lower_7_listwise
write.csv(lower_7, "./complete_cases_PID7_correlations.csv")

lower.tri(cor_PID3_listwise, diag = FALSE)
upper.tri(cor_PID3_listwise, diag = FALSE)
lower_3_listwise<-cor_PID3_listwise
lower_3_listwise[lower.tri(cor_PID3_listwise, diag=TRUE)]=""
lower_3_listwise<-as.data.frame(lower_3_listwise)
lower_3_listwise
write.csv(lower_3, "./complete_cases_PID3_correlations.csv")

#Counting the number of observations in each cell
library("psych")
pairwiseCount(all_waves_PID7)
pairwiseCount(all_waves_PID3)

######
#To create the final correlation matrices for presentation, combine the pairwise correlations for the total cases (missing_value_minimum = 0) with the correlations for the complete case dataframes (which is equivalent to listwise deletion) to match the four matrices you have in the "Total Cases Analysis" tab and the "Complete Cases Analysis" tab
#PID7 Correlation Matrix (pairwise is lower diagonal, listwise is upper diagonal)
lower.tri(cor_PID7_listwise, diag = FALSE)
lower.tri(cor_PID7, diag = FALSE)
lower_7_listwise<-cor_PID7_listwise
lower_7<-cor_PID7

for (i in 1:nrow(lower_7_listwise)){
  for (j in 1:ncol(lower_7_listwise)){
    if (i>j){
      lower_7_listwise[i,j] = lower_7[i,j]
    }
    if (lower_7_listwise[i,j] == 1.0){
      lower_7_listwise[i,j] = "**"
    }
  }
}

lower_7_listwise<-as.data.frame(lower_7_listwise)
lower_7_listwise
write.csv(lower_7_listwise, "./final_PID7_correlations.csv")

#PID3 Correlation Matrix (pairwise is lower diagonal, listwise is upper diagonal)
lower.tri(cor_PID3_listwise, diag = FALSE)
lower.tri(cor_PID3, diag = FALSE)
lower_3_listwise<-cor_PID3_listwise
lower_3<-cor_PID3

for (i in 1:nrow(lower_3_listwise)){
  for (j in 1:ncol(lower_3_listwise)){
    if (i>j){
      lower_3_listwise[i,j] = lower_3[i,j]
    }
    if (lower_3_listwise[i,j] == 1.0){
      lower_3_listwise[i,j] = "**"
    }
  }
}

lower_3_listwise<-as.data.frame(lower_3_listwise)
lower_3_listwise
write.csv(lower_3_listwise, "./final_PID3_correlations.csv")
######

#Means and SDs (Individuals)
all_waves_PID7$Mean_PID7 = apply(all_waves_PID7[,1:6], 1, mean, na.rm = TRUE)
all_waves_PID7$SD_PID7 = apply(all_waves_PID7[,1:6], 1, sd, na.rm = TRUE)

summary(all_waves_PID7$Mean_PID7)
summary(all_waves_PID7$SD_PID7)

all_waves_PID3$Mean_PID3 = apply(all_waves_PID3[,1:6], 1, mean, na.rm = TRUE)
all_waves_PID3$SD_PID3 = apply(all_waves_PID3[,1:6], 1, sd, na.rm = TRUE)

summary(all_waves_PID3$Mean_PID3)
summary(all_waves_PID3$SD_PID3)

#Means and SDs (Waves)
as.data.frame(round(apply(all_waves_PID7, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID7, 2, sd, na.rm=TRUE),2))

as.data.frame(round(apply(all_waves_PID3, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID3, 2, sd, na.rm=TRUE),2))



####Time Series and IV Regressions
###Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Filtering by rows
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))==0,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))==0,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##Creating multiple dataframes
complete_cases_11_12_16_PID7 = all_waves_PID7[, c('pid7_2011','pid7_2012','pid7_2016')]
complete_cases_12_16_17_PID7 = all_waves_PID7[, c('pid7_2012','pid7_2016','pid7_2017')]
complete_cases_16_17_18_PID7 = all_waves_PID7[, c('pid7_2016','pid7_2017','pid7_2018')]
complete_cases_17_18_19_PID7 = all_waves_PID7[, c('pid7_2017','pid7_2018','pid7_2019')]

names(complete_cases_11_12_16_PID7)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_12_16_17_PID7)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_16_17_18_PID7)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_17_18_19_PID7)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')

complete_cases_11_12_16_PID3 = all_waves_PID3[, c('pid3_2011','pid3_2012','pid3_2016')]
complete_cases_12_16_17_PID3 = all_waves_PID3[, c('pid3_2012','pid3_2016','pid3_2017')]
complete_cases_16_17_18_PID3 = all_waves_PID3[, c('pid3_2016','pid3_2017','pid3_2018')]
complete_cases_17_18_19_PID3 = all_waves_PID3[, c('pid3_2017','pid3_2018','pid3_2019')]

names(complete_cases_11_12_16_PID3)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_12_16_17_PID3)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_16_17_18_PID3)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_17_18_19_PID3)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')

##Creating combined dataframes
complete_cases_rbind_PID7 = rbind(complete_cases_11_12_16_PID7,complete_cases_12_16_17_PID7,complete_cases_16_17_18_PID7,complete_cases_17_18_19_PID7)
complete_cases_rbind_PID3 = rbind(complete_cases_11_12_16_PID3,complete_cases_12_16_17_PID3,complete_cases_16_17_18_PID3,complete_cases_17_18_19_PID3)

##Adding a wave column
w1_PID7 = as.data.frame(rep('2011',nrow(complete_cases_11_12_16_PID7)))
names(w1_PID7)[1] <- 'wave'
w2_PID7 = as.data.frame(rep('2012',nrow(complete_cases_12_16_17_PID7)))
names(w2_PID7)[1] <- 'wave'
w3_PID7 = as.data.frame(rep('2016',nrow(complete_cases_16_17_18_PID7)))
names(w3_PID7)[1] <- 'wave'
w4_PID7 = as.data.frame(rep('2017',nrow(complete_cases_17_18_19_PID7)))
names(w4_PID7)[1] <- 'wave'

w1_PID3 = as.data.frame(rep('2011',nrow(complete_cases_11_12_16_PID3)))
names(w1_PID3)[1] <- 'wave'
w2_PID3 = as.data.frame(rep('2012',nrow(complete_cases_12_16_17_PID3)))
names(w2_PID3)[1] <- 'wave'
w3_PID3 = as.data.frame(rep('2016',nrow(complete_cases_16_17_18_PID3)))
names(w3_PID3)[1] <- 'wave'
w4_PID3 = as.data.frame(rep('2017',nrow(complete_cases_17_18_19_PID3)))
names(w4_PID3)[1] <- 'wave'

complete_cases_rbind_PID7_waves = rbind(w1_PID7,w2_PID7,w3_PID7,w4_PID7)
complete_cases_rbind_PID7 = cbind(complete_cases_rbind_PID7,complete_cases_rbind_PID7_waves)
summary(complete_cases_rbind_PID7)

complete_cases_rbind_PID3_waves = rbind(w1_PID3,w2_PID3,w3_PID3,w4_PID3)
complete_cases_rbind_PID3 = cbind(complete_cases_rbind_PID3,complete_cases_rbind_PID3_waves)
summary(complete_cases_rbind_PID3)

##Time Series Regressions
summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_rbind_PID7))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_rbind_PID7))
#summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0 + wave, data=complete_cases_rbind_PID7))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_11_12_16_PID7))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_11_12_16_PID7))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_12_16_17_PID7))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_12_16_17_PID7))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_16_17_18_PID7))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_16_17_18_PID7))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_17_18_19_PID7))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_17_18_19_PID7))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_rbind_PID3))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_rbind_PID3))
#summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0 + wave, data=complete_cases_rbind_PID3))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_11_12_16_PID3))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_11_12_16_PID3))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_12_16_17_PID3))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_12_16_17_PID3))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_16_17_18_PID3))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_16_17_18_PID3))

summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_17_18_19_PID3))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_17_18_19_PID3))

##IV Regressions
library('ivreg')
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_rbind_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_11_12_16_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_12_16_17_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_16_17_18_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_17_18_19_PID7))

summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_rbind_PID3))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_11_12_16_PID3))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_12_16_17_PID3))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_16_17_18_PID3))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_17_18_19_PID3))



####Replicating Ansolabehere, Rodden, and Snyder (2008) by taking simple averages
###Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7_pairs = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3_pairs = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7_pairs = convert_to_numeric_PID7(all_waves_PID7_pairs)
all_waves_PID3_pairs = convert_to_numeric_PID3(all_waves_PID3_pairs)

##Filtering by rows
all_waves_PID7_pairs = all_waves_PID7_pairs[rowSums(is.na(all_waves_PID7_pairs))==0,]
all_waves_PID3_pairs = all_waves_PID3_pairs[rowSums(is.na(all_waves_PID3_pairs))==0,]

summary(rowSums(is.na(all_waves_PID7_pairs))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3_pairs))) #Checking that the correct type of cases remain

##Creating dataframes
all_waves_PID7_pairs$pid7_2011_2012 = apply(all_waves_PID7_pairs[,1:2], 1, mean, na.rm = TRUE)
all_waves_PID7_pairs$pid7_2016_2017 = apply(all_waves_PID7_pairs[,3:4], 1, mean, na.rm = TRUE)
all_waves_PID7_pairs$pid7_2018_2019 = apply(all_waves_PID7_pairs[,5:6], 1, mean, na.rm = TRUE)
PID7_pairs = all_waves_PID7_pairs[,7:9]

all_waves_PID3_pairs$pid3_2011_2012 = apply(all_waves_PID3_pairs[,1:2], 1, mean, na.rm = TRUE)
all_waves_PID3_pairs$pid3_2016_2017 = apply(all_waves_PID3_pairs[,3:4], 1, mean, na.rm = TRUE)
all_waves_PID3_pairs$pid3_2018_2019 = apply(all_waves_PID3_pairs[,5:6], 1, mean, na.rm = TRUE)
PID3_pairs = all_waves_PID3_pairs[,7:9]

##Computing statistics
#Correlations
cor_PID7 = round(cor(PID7_pairs, use="pairwise.complete.obs"),3)
cor_PID3 = round(cor(PID3_pairs, use="pairwise.complete.obs"),3)

lower.tri(cor_PID7, diag = FALSE)
upper.tri(cor_PID7, diag = FALSE)
lower_7<-cor_PID7
lower_7[lower.tri(cor_PID7, diag=TRUE)]=""
lower_7<-as.data.frame(lower_7)
lower_7
write.csv(lower_7, "./complete_cases_PID7_pairs_correlations.csv")

lower.tri(cor_PID3, diag = FALSE)
upper.tri(cor_PID3, diag = FALSE)
lower_3<-cor_PID3
lower_3[lower.tri(cor_PID3, diag=TRUE)]=""
lower_3<-as.data.frame(lower_3)
lower_3
write.csv(lower_3, "./complete_cases_PID3_pairs_correlations.csv")

#Time Series Regressions
summary(lm(pid7_2018_2019 ~ pid7_2016_2017, data=PID7_pairs))
summary(lm(pid7_2018_2019 ~ pid7_2016_2017 + pid7_2011_2012, data=PID7_pairs))

summary(lm(pid3_2018_2019 ~ pid3_2016_2017, data=PID3_pairs))
summary(lm(pid3_2018_2019 ~ pid3_2016_2017 + pid3_2011_2012, data=PID3_pairs))

#IV Regressions
library('ivreg')
summary(ivreg(pid7_2018_2019 ~ pid7_2016_2017, ~ pid7_2011_2012, data=PID7_pairs))

summary(ivreg(pid3_2018_2019 ~ pid3_2016_2017, ~ pid3_2011_2012, data=PID3_pairs))



####Alternative PID3 (leaners classified as partisans rather than Independents)
vsg_alt_PID3 = vsg

vsg_alt_PID3$pid3_2011_alt = ifelse(vsg_alt_PID3$pid7_2011==3,1,ifelse(vsg_alt_PID3$pid7_2011==5,2,vsg_alt_PID3$pid3_2011))
table(vsg_alt_PID3$pid3_2011_alt)
table(vsg_alt_PID3$pid3_2011)
table(vsg_alt_PID3$pid7_2011)
table(vsg_alt_PID3$pid3_2011_alt,vsg_alt_PID3$pid3_2011)

vsg_alt_PID3$pid3_2012_alt = ifelse(vsg_alt_PID3$pid7_2012==3,1,ifelse(vsg_alt_PID3$pid7_2012==5,2,vsg_alt_PID3$pid3_2012))
table(vsg_alt_PID3$pid3_2012_alt)
table(vsg_alt_PID3$pid3_2012)
table(vsg_alt_PID3$pid7_2012)
table(vsg_alt_PID3$pid3_2012_alt,vsg_alt_PID3$pid3_2012)

vsg_alt_PID3$pid3_2016_alt = ifelse(vsg_alt_PID3$pid7_2016==3,1,ifelse(vsg_alt_PID3$pid7_2016==5,2,vsg_alt_PID3$pid3_2016))
table(vsg_alt_PID3$pid3_2016_alt)
table(vsg_alt_PID3$pid3_2016)
table(vsg_alt_PID3$pid7_2016)
table(vsg_alt_PID3$pid3_2016_alt,vsg_alt_PID3$pid3_2016)

vsg_alt_PID3$pid3_2017_alt = ifelse(vsg_alt_PID3$pid7_2017==3,1,ifelse(vsg_alt_PID3$pid7_2017==5,2,vsg_alt_PID3$pid3_2017))
table(vsg_alt_PID3$pid3_2017_alt)
table(vsg_alt_PID3$pid3_2017)
table(vsg_alt_PID3$pid7_2017)
table(vsg_alt_PID3$pid3_2017_alt,vsg_alt_PID3$pid3_2017)

vsg_alt_PID3$pid3_2018_alt = ifelse(vsg_alt_PID3$pid7_2018==3,1,ifelse(vsg_alt_PID3$pid7_2018==5,2,vsg_alt_PID3$pid3_2018))
table(vsg_alt_PID3$pid3_2018_alt)
table(vsg_alt_PID3$pid3_2018)
table(vsg_alt_PID3$pid7_2018)
table(vsg_alt_PID3$pid3_2018_alt,vsg_alt_PID3$pid3_2018)

vsg_alt_PID3$pid3_2019_alt = ifelse(vsg_alt_PID3$pid7_2019==3,1,ifelse(vsg_alt_PID3$pid7_2019==5,2,vsg_alt_PID3$pid3_2019))
table(vsg_alt_PID3$pid3_2019_alt)
table(vsg_alt_PID3$pid3_2019)
table(vsg_alt_PID3$pid7_2019)
table(vsg_alt_PID3$pid3_2019_alt,vsg_alt_PID3$pid3_2019)

##Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID3_alt = vsg_alt_PID3[, c('pid3_2011_alt','pid3_2012_alt','pid3_2016_alt','pid3_2017_alt','pid3_2018_alt','pid3_2019_alt')]

##Converting values to new scale and formatting as numeric
all_waves_PID3_alt = convert_to_numeric_PID3(all_waves_PID3_alt)

##Filtering by rows
all_waves_PID3_alt = all_waves_PID3_alt[rowSums(is.na(all_waves_PID3_alt))==0,]

summary(rowSums(is.na(all_waves_PID3_alt))) #Checking that the correct type of cases remain

##Computing statistics
#Correlations
cor_PID3 = round(cor(all_waves_PID3_alt),3)

lower.tri(cor_PID3, diag = FALSE)
upper.tri(cor_PID3, diag = FALSE)
lower_3<-cor_PID3
lower_3[lower.tri(cor_PID3, diag=TRUE)]=""
lower_3<-as.data.frame(lower_3)
lower_3
write.csv(lower_3, "./complete_cases_PID3_alternative_correlations.csv")

#Means and SDs (Individuals)
all_waves_PID3_alt$Mean_PID3_alt = apply(all_waves_PID3_alt[,1:6], 1, mean, na.rm = TRUE)
all_waves_PID3_alt$SD_PID3_alt = apply(all_waves_PID3_alt[,1:6], 1, sd, na.rm = TRUE)

summary(all_waves_PID3_alt$Mean_PID3_alt)
summary(all_waves_PID3_alt$SD_PID3_alt)

#Means and SDs (Waves)
as.data.frame(round(apply(all_waves_PID3_alt, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID3_alt, 2, sd, na.rm=TRUE),2))

##Time Series and IV Regressions
#Creating multiple dataframes
complete_cases_11_12_16_PID3_alt = all_waves_PID3_alt[, c('pid3_2011_alt','pid3_2012_alt','pid3_2016_alt')]
complete_cases_12_16_17_PID3_alt = all_waves_PID3_alt[, c('pid3_2012_alt','pid3_2016_alt','pid3_2017_alt')]
complete_cases_16_17_18_PID3_alt = all_waves_PID3_alt[, c('pid3_2016_alt','pid3_2017_alt','pid3_2018_alt')]
complete_cases_17_18_19_PID3_alt = all_waves_PID3_alt[, c('pid3_2017_alt','pid3_2018_alt','pid3_2019_alt')]

names(complete_cases_11_12_16_PID3_alt)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_12_16_17_PID3_alt)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_16_17_18_PID3_alt)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')
names(complete_cases_17_18_19_PID3_alt)[1:3] <- c('PID_t_0','PID_t_1','PID_t_2')

#Creating combined dataframes
complete_cases_rbind_PID3_alt = rbind(complete_cases_11_12_16_PID3_alt,complete_cases_12_16_17_PID3_alt,complete_cases_16_17_18_PID3_alt,complete_cases_17_18_19_PID3_alt)

#Adding a wave column
w1_PID3_alt = as.data.frame(rep('2011',nrow(complete_cases_11_12_16_PID3_alt)))
names(w1_PID3_alt)[1] <- 'wave'
w2_PID3_alt = as.data.frame(rep('2012',nrow(complete_cases_12_16_17_PID3_alt)))
names(w2_PID3_alt)[1] <- 'wave'
w3_PID3_alt = as.data.frame(rep('2016',nrow(complete_cases_16_17_18_PID3_alt)))
names(w3_PID3_alt)[1] <- 'wave'
w4_PID3_alt = as.data.frame(rep('2017',nrow(complete_cases_17_18_19_PID3_alt)))
names(w4_PID3_alt)[1] <- 'wave'

complete_cases_rbind_PID3_waves_alt = rbind(w1_PID3_alt,w2_PID3_alt,w3_PID3_alt,w4_PID3_alt)
complete_cases_rbind_PID3_alt = cbind(complete_cases_rbind_PID3_alt,complete_cases_rbind_PID3_waves_alt)
summary(complete_cases_rbind_PID3_alt)

#Time Series Regressions
summary(lm(PID_t_2 ~ PID_t_1, data=complete_cases_rbind_PID3_alt))
summary(lm(PID_t_2 ~ PID_t_1 + PID_t_0, data=complete_cases_rbind_PID3_alt))

#IV Regression
library('ivreg')
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_rbind_PID3_alt))



####Estimating Drift Respondent-Wise [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Filtering by rows
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))==0,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))==0,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##Determining how many respondents have identical values throughout the six-wave time series
identical_PID7 = rep(NA,nrow(all_waves_PID7))
for (i in 1:nrow(all_waves_PID7)){
  num = length(unique(as.list(all_waves_PID7[i,])))
  if(num==1){
    identical_PID7[i] = 1
  }
  else{
    identical_PID7[i] = 0
  }
}

sum(identical_PID7)/4013 #0.533

identical_PID3 = rep(NA,nrow(all_waves_PID3))
for (i in 1:nrow(all_waves_PID3)){
  num = length(unique(as.list(all_waves_PID3[i,])))
  if(num==1){
    identical_PID3[i] = 1
  }
  else{
    identical_PID3[i] = 0
  }
}

sum(identical_PID3)/3768 #0.795

##Creating and appending unique ID column
unique_ID_PID7 = as.data.frame(seq(1,nrow(all_waves_PID7),1))
all_waves_PID7 = cbind(unique_ID_PID7,all_waves_PID7)
names(all_waves_PID7)[1] = 'Unique ID'

unique_ID_PID3 = as.data.frame(seq(1,nrow(all_waves_PID3),1))
all_waves_PID3 = cbind(unique_ID_PID3,all_waves_PID3)
names(all_waves_PID3)[1] = 'Unique ID'

##Creating long form dataframes
library(tidyr)
all_waves_PID_7_long = gather(all_waves_PID7,PID_wave,PID_7,pid7_2011:pid7_2019,factor_key=TRUE)
all_waves_PID_7_long = all_waves_PID_7_long[order(all_waves_PID_7_long$`Unique ID`),]

all_waves_PID_3_long = gather(all_waves_PID3,PID_wave,PID_3,pid3_2011:pid3_2019,factor_key=TRUE)
all_waves_PID_3_long = all_waves_PID_3_long[order(all_waves_PID_3_long$`Unique ID`),]

##Adding wave columns
waves_PID7 = as.data.frame(cbind(rep(c(0,1,2,3,4,5),nrow(all_waves_PID7)),rep(c(0,0.917,5,5.583,6.333,7),nrow(all_waves_PID7))))
names(waves_PID7) <- c('wave_counter','years_since_origin')

waves_PID3 = as.data.frame(cbind(rep(c(0,1,2,3,4,5),nrow(all_waves_PID3)),rep(c(0,0.917,5,5.583,6.333,7),nrow(all_waves_PID3))))
names(waves_PID3) <- c('wave_counter','years_since_origin')

all_waves_PID_7_long = cbind(all_waves_PID_7_long,waves_PID7)
all_waves_PID_3_long = cbind(all_waves_PID_3_long,waves_PID3)

##OLS Regressions
#wave_counter
PID_7_drift_coef = c()
PID_7_drift_p_value = c()
UID_set_PID_7 = c()
for (i in all_waves_PID_7_long$`Unique ID`){
  if (is.element(i,UID_set_PID_7)){
    next
  }
  else{
    df = subset(all_waves_PID_7_long, `Unique ID`==i)
    lm = lm(PID_7 ~ wave_counter, data=df)
    coef = summary(lm)$coefficients[2,1]
    coef2 = summary(lm)$coefficients[2,4]
    PID_7_drift_coef = c(PID_7_drift_coef, coef)
    PID_7_drift_p_value = c(PID_7_drift_p_value,coef2)
    UID_set_PID_7 = c(UID_set_PID_7,i)
  }
}

PID_3_drift_coef = c()
PID_3_drift_p_value = c()
UID_set_PID_3 = c()
for (i in all_waves_PID_3_long$`Unique ID`){
  if (is.element(i,UID_set_PID_3)){
    next
  }
  else{
    df = subset(all_waves_PID_3_long, `Unique ID`==i)
    lm = lm(PID_3 ~ wave_counter, data=df)
    coef = summary(lm)$coefficients[2,1]
    coef2 = summary(lm)$coefficients[2,4]
    PID_3_drift_coef = c(PID_3_drift_coef, coef)
    PID_3_drift_p_value = c(PID_3_drift_p_value,coef2)
    UID_set_PID_3 = c(UID_set_PID_3,i)
  }
}

hist(PID_7_drift_coef, breaks=50)
summary(PID_7_drift_coef)
threshold_val = 1/5 #This is the coefficient value required to drift one-full point over the full time-series.
counter_threshold_PID7 = 0
for(i in PID_7_drift_coef){
  if(is.nan(i)){
    next
  }
  else{
    if(abs(i>=threshold_val)){
      counter_threshold_PID7 = counter_threshold_PID7+1
    }
  }
}

counter_threshold_PID7/nrow(all_waves_PID7) #.125

hist(PID_7_drift_p_value, breaks=50)
summary(PID_7_drift_p_value)
counter_PID7 = 0
for(i in PID_7_drift_p_value){
  if(is.nan(i)){
    next
  }
  else{
    if(i<0.05){
      counter_PID7 = counter_PID7+1
    }
  }
}

hist(PID_3_drift_coef, breaks=50)
summary(PID_3_drift_coef)
threshold_val = 1/5 #This is the coefficient value required to drift one-full point over the full time-series.
counter_threshold_PID3 = 0
for(i in PID_3_drift_coef){
  if(is.nan(i)){
    next
  }
  else{
    if(abs(i>=threshold_val)){
      counter_threshold_PID3 = counter_threshold_PID3+1
    }
  }
}

counter_threshold_PID3/nrow(all_waves_PID3) #.047

hist(PID_3_drift_p_value, breaks=50)
summary(PID_3_drift_p_value)
counter_PID3 = 0
for(i in PID_3_drift_p_value){
  if(is.nan(i)){
    next
  }
  else{
    if(i<0.05){
      counter_PID3 = counter_PID3+1
    }
  }
}

#Counting respondents whose coefficients are exactly 0
sum(ifelse(PID_7_drift_coef==0,1,0))/nrow(all_waves_PID7) #0.063
sum(ifelse(PID_3_drift_coef==0,1,0))/nrow(all_waves_PID3) #0.242
#THESE NUMBERS ARE CORRECT BUT MISLEADING. THEY EXCLUDE VALUES INFINITESIMALLY DIFFERENT FROM ZERO. USE THE RESULTS CALCULATED ABOVE INSTEAD.

#years_since_origin
PID_7_drift_coef = c()
PID_7_drift_p_value = c()
UID_set_PID_7 = c()
for (i in all_waves_PID_7_long$`Unique ID`){
  if (is.element(i,UID_set_PID_7)){
    next
  }
  else{
    df = subset(all_waves_PID_7_long, `Unique ID`==i)
    lm = lm(PID_7 ~ years_since_origin, data=df)
    coef = summary(lm)$coefficients[2,1]
    coef2 = summary(lm)$coefficients[2,4]
    PID_7_drift_coef = c(PID_7_drift_coef, coef)
    PID_7_drift_p_value = c(PID_7_drift_p_value,coef2)
    UID_set_PID_7 = c(UID_set_PID_7,i)
  }
}

PID_3_drift_coef = c()
PID_3_drift_p_value = c()
UID_set_PID_3 = c()
for (i in all_waves_PID_3_long$`Unique ID`){
  if (is.element(i,UID_set_PID_3)){
    next
  }
  else{
    df = subset(all_waves_PID_3_long, `Unique ID`==i)
    lm = lm(PID_3 ~ years_since_origin, data=df)
    coef = summary(lm)$coefficients[2,1]
    coef2 = summary(lm)$coefficients[2,4]
    PID_3_drift_coef = c(PID_3_drift_coef, coef)
    PID_3_drift_p_value = c(PID_3_drift_p_value,coef2)
    UID_set_PID_3 = c(UID_set_PID_3,i)
  }
}

hist(PID_7_drift_coef, breaks=50)
summary(PID_7_drift_coef)
threshold_val = 1/7 #This is the coefficient value required to drift one-full point over the full time-series.
counter_threshold_PID7 = 0
for(i in PID_7_drift_coef){
  if(is.nan(i)){
    next
  }
  else{
    if(abs(i>=threshold_val)){
      counter_threshold_PID7 = counter_threshold_PID7+1
    }
  }
}

counter_threshold_PID7/nrow(all_waves_PID7) #.120

hist(PID_7_drift_p_value, breaks=50)
summary(PID_7_drift_p_value)
counter_PID7 = 0
for(i in PID_7_drift_p_value){
  if(is.nan(i)){
    next
  }
  else{
    if(i<0.05){
      counter_PID7 = counter_PID7+1
    }
  }
}

hist(PID_3_drift_coef, breaks=50)
summary(PID_3_drift_coef)
threshold_val = 1/7 #This is the coefficient value required to drift one-full point over the full time-series.
counter_threshold_PID3 = 0
for(i in PID_3_drift_coef){
  if(is.nan(i)){
    next
  }
  else{
    if(abs(i>=threshold_val)){
      counter_threshold_PID3 = counter_threshold_PID3+1
    }
  }
}

counter_threshold_PID3/nrow(all_waves_PID3) #.043

hist(PID_3_drift_p_value, breaks=50)
summary(PID_3_drift_p_value)
counter_PID3 = 0
for(i in PID_3_drift_p_value){
  if(is.nan(i)){
    next
  }
  else{
    if(i<0.05){
      counter_PID3 = counter_PID3+1
    }
  }
}

#Counting respondents whose coefficients are exactly 0
sum(ifelse(PID_7_drift_coef==0,1,0))/nrow(all_waves_PID7) #0.061
sum(ifelse(PID_3_drift_coef==0,1,0))/nrow(all_waves_PID3) #0.240
#THESE NUMBERS ARE CORRECT BUT MISLEADING. THEY EXCLUDE VALUES INFINITESIMALLY DIFFERENT FROM ZERO. USE THE RESULTS CALCULATED ABOVE INSTEAD.

#Visualizations for paper
library(ggplot2)
PID_7_drift_coef_df = as.data.frame(PID_7_drift_coef)
PID_3_drift_coef_df = as.data.frame(PID_3_drift_coef)

ggplot(PID_7_drift_coef_df, aes(x=PID_7_drift_coef)) +
  geom_histogram(color="black", binwidth=0.025) + 
  ggtitle("") + 
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) +
  xlab("OLS Coefficients (Slopes)") +
  ylab("Frequency") +
  xlim(-1,1)

#Saving in high resolution
ggsave(filename = "./VSG_Trajectories_Wave Count_Narrow Bins_090922.png", height = 5.7, width = 15.29, units = c("cm"), device='png', dpi=600)



####Predicting Individual-level Standard Deviations [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Creating and appending unique ID column
unique_ID_PID7 = as.data.frame(seq(1,nrow(all_waves_PID7),1))
all_waves_PID7 = cbind(unique_ID_PID7,all_waves_PID7)
names(all_waves_PID7)[1] = 'Unique ID'

unique_ID_PID3 = as.data.frame(seq(1,nrow(all_waves_PID3),1))
all_waves_PID3 = cbind(unique_ID_PID3,all_waves_PID3)
names(all_waves_PID3)[1] = 'Unique ID'

##Filtering by rows
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))==0,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))==0,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##Adding Means and SDs (Individuals)
all_waves_PID7$Mean_PID7 = apply(all_waves_PID7[,2:7], 1, mean)
all_waves_PID7$SD_PID7 = apply(all_waves_PID7[,2:7], 1, sd)

all_waves_PID3$Mean_PID3 = apply(all_waves_PID3[,2:7], 1, mean)
all_waves_PID3$SD_PID3 = apply(all_waves_PID3[,2:7], 1, sd)

##Adding 2011 demographic predictors
demographics_2011 = vsg[, c('gender_baseline','region_baseline','faminc_baseline','race_baseline','birthyr_baseline','educ_baseline')]
demographics_2011 = cbind(unique_ID_PID3,demographics_2011)
names(demographics_2011)[1] = 'Unique ID'
summary(demographics_2011)

demographics_2011$gender_baseline = as.factor(ifelse(demographics_2011$gender_baseline==1,"Male",ifelse(demographics_2011$gender_baseline==2,"Female",NA)))
demographics_2011$gender_baseline = relevel(demographics_2011$gender_baseline, ref = "Male")

demographics_2011$region_baseline = ifelse(is.na(demographics_2011$region_baseline),'Missing',demographics_2011$region_baseline)
demographics_2011$region_baseline = as.factor(ifelse(demographics_2011$region_baseline==1,"Northeast",ifelse(demographics_2011$region_baseline==2,"Midwest",ifelse(demographics_2011$region_baseline==3,"South",ifelse(demographics_2011$region_baseline==4,"West","Missing")))))
demographics_2011$region_baseline = relevel(demographics_2011$region_baseline, ref = "Northeast")

demographics_2011$faminc_baseline = sapply(demographics_2011$faminc_baseline, income)
demographics_2011$faminc_baseline_thousands = demographics_2011$faminc_baseline/1000 

demographics_2011$race_baseline = as.factor(ifelse(demographics_2011$race_baseline==1,"White",ifelse(demographics_2011$race_baseline==2,"Black",ifelse(demographics_2011$race_baseline==3,"Hispanic",ifelse(demographics_2011$race_baseline==4,"Other",ifelse(demographics_2011$race_baseline==5,"Other",ifelse(demographics_2011$race_baseline==6,"2+ Races",ifelse(demographics_2011$race_baseline==7,"Other",ifelse(demographics_2011$race_baseline==8,"Other",NA)))))))))
demographics_2011$race_baseline = relevel(demographics_2011$race_baseline, ref = "White")

demographics_2011$birthyr_baseline = ifelse(demographics_2011$birthyr_baseline>=9000,NA,demographics_2011$birthyr_baseline)
demographics_2011$age = as.numeric(2011-demographics_2011$birthyr_baseline)

demographics_2011$educ_baseline = as.factor(ifelse(demographics_2011$educ_baseline==1,"No HS",ifelse(demographics_2011$educ_baseline==2,"High school",ifelse(demographics_2011$educ_baseline==3,"Some college",ifelse(demographics_2011$educ_baseline==4,"Some college",ifelse(demographics_2011$educ_baseline==5,"Bachelor's degree+",ifelse(demographics_2011$educ_baseline==6,"Bachelor's degree+",NA)))))))
demographics_2011$educ_baseline = relevel(demographics_2011$educ_baseline, ref = "No HS")

summary(demographics_2011)

##Joining PID dataframes and demographic dataframes
all_waves_PID7 = merge(all_waves_PID7,demographics_2011, by="Unique ID", all.x=TRUE)
all_waves_PID3 = merge(all_waves_PID3,demographics_2011, by="Unique ID", all.x=TRUE)

##Creating two more predictors
all_waves_PID7$partisan_intensity_2011 = ifelse(all_waves_PID7$pid7_2011==3|all_waves_PID7$pid7_2011==-3,2,ifelse(all_waves_PID7$pid7_2011==2|all_waves_PID7$pid7_2011==-2,1,ifelse(all_waves_PID7$pid7_2011==1|all_waves_PID7$pid7_2011==0|all_waves_PID7$pid7_2011==-1,0,NA)))
table(all_waves_PID7$partisan_intensity_2011,all_waves_PID7$pid7_2011)

all_waves_PID7$democrat_2011 = ifelse(all_waves_PID7$pid7_2011<0,1,ifelse(all_waves_PID7$pid7_2011>0,0,ifelse(all_waves_PID7$pid7_2011==0,0.5,NA)))
table(all_waves_PID7$democrat_2011,all_waves_PID7$pid7_2011)

##OLS regression equations
summary(lm(SD_PID7 ~ gender_baseline, data = all_waves_PID7)) #Gender is not associated with standard deviation
summary(lm(SD_PID7 ~ faminc_baseline_thousands, data = all_waves_PID7)) #Income is negatively associated with standard deviation
summary(lm(SD_PID7 ~ region_baseline, data = all_waves_PID7)) #Western region is negatively associated with standard deviation compared to Northeastern region
summary(lm(SD_PID7 ~ race_baseline, data = all_waves_PID7)) #Black race is negatively associated with standard deviation as compared to White race
summary(lm(SD_PID7 ~ age, data = all_waves_PID7)) #Age is not associated with standard deviation
summary(lm(SD_PID7 ~ educ_baseline, data = all_waves_PID7)) #Education is strongly negatively associated with standard deviation
summary(lm(SD_PID7 ~ partisan_intensity_2011, data = all_waves_PID7)) #Partisan intensity is strongly negatively associated with standard deviation
summary(lm(SD_PID7 ~ democrat_2011, data = all_waves_PID7)) #Partisanship is not associated with standard deviation

summary(all_waves_PID7$SD_PID7)
summary(all_waves_PID3$SD_PID3)
hist(all_waves_PID7$SD_PID7)
hist(all_waves_PID3$SD_PID3)

mean(all_waves_PID7$SD_PID7)
mean(all_waves_PID3$SD_PID3)



####OLS and IV Regressions for Coefficients Table
###Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]
all_waves_PID3 = vsg[, c('pid3_2011','pid3_2012','pid3_2016','pid3_2017','pid3_2018','pid3_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)
all_waves_PID3 = convert_to_numeric_PID3(all_waves_PID3)

##Filtering by rows
all_waves_PID7 = all_waves_PID7[rowSums(is.na(all_waves_PID7))==0,]
all_waves_PID3 = all_waves_PID3[rowSums(is.na(all_waves_PID3))==0,]

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain
summary(rowSums(is.na(all_waves_PID3))) #Checking that the correct type of cases remain

##OLS Regressions
summary(lm(pid7_2012 ~ pid7_2011, data = all_waves_PID7))
summary(lm(pid7_2016 ~ pid7_2012, data = all_waves_PID7))
summary(lm(pid7_2017 ~ pid7_2016, data = all_waves_PID7))
summary(lm(pid7_2018 ~ pid7_2017, data = all_waves_PID7))
summary(lm(pid7_2019 ~ pid7_2018, data = all_waves_PID7))

##IV Regressions
library('ivreg')
summary(ivreg(pid7_2016 ~ pid7_2012, ~ pid7_2011, data = all_waves_PID7))
summary(ivreg(pid7_2017 ~ pid7_2016, ~ pid7_2012, data = all_waves_PID7))
summary(ivreg(pid7_2018 ~ pid7_2017, ~ pid7_2016, data = all_waves_PID7))
summary(ivreg(pid7_2019 ~ pid7_2018, ~ pid7_2017, data = all_waves_PID7))

##Computing Implied Wiley-Wiley (1970) R^2s
ww1970_r_squares = rep(NA,3) 
for (i in 2:4){
  ww1970_r_squares[i-1] = wiley_r_squared(all_waves_PID7,i)
}

ww1970_r_squares



####Cross-Panel Analysis
###VSG VS. ISCAP
##Creating data set [COMPLETE CASES ONLY]
#Filtering by columns
all_waves_PID7_VSG_vs_ISCAP = vsg[, c('pid7_2012','pid7_2016','pid7_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7_VSG_vs_ISCAP = convert_to_numeric_PID7(all_waves_PID7_VSG_vs_ISCAP)

##Filtering by rows
all_waves_PID7_VSG_vs_ISCAP = all_waves_PID7_VSG_vs_ISCAP[rowSums(is.na(all_waves_PID7_VSG_vs_ISCAP))==0,]

summary(rowSums(is.na(all_waves_PID7_VSG_vs_ISCAP))) #Checking that the correct type of cases remain

##Computing statistics
#Correlations
cor_PID7_listwise = round(cor(all_waves_PID7_VSG_vs_ISCAP),3)

lower.tri(cor_PID7_listwise, diag = FALSE)
upper.tri(cor_PID7_listwise, diag = FALSE)
lower_7_listwise<-cor_PID7_listwise
lower_7_listwise[lower.tri(cor_PID7_listwise, diag=TRUE)]=""
lower_7_listwise<-as.data.frame(lower_7_listwise)
lower_7_listwise
write.csv(lower_7_listwise, "./VSG_vs_ISCAP_PID7_correlations.csv")

#Means and SDs (Individuals)
all_waves_PID7_VSG_vs_ISCAP$Mean_PID7 = apply(all_waves_PID7_VSG_vs_ISCAP[,1:3], 1, mean, na.rm = TRUE)
all_waves_PID7_VSG_vs_ISCAP$SD_PID7 = apply(all_waves_PID7_VSG_vs_ISCAP[,1:3], 1, sd, na.rm = TRUE)

summary(all_waves_PID7_VSG_vs_ISCAP$Mean_PID7)
summary(all_waves_PID7_VSG_vs_ISCAP$SD_PID7)

#Means and SDs (Waves)
as.data.frame(round(apply(all_waves_PID7_VSG_vs_ISCAP, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID7_VSG_vs_ISCAP, 2, sd, na.rm=TRUE),2))

##Time Series and IV Regressions
#OLS Regressions
summary(lm(pid7_2016 ~ pid7_2012, data = all_waves_PID7_VSG_vs_ISCAP))
summary(lm(pid7_2019 ~ pid7_2016, data = all_waves_PID7_VSG_vs_ISCAP))

#IV Regressions
library('ivreg')
summary(ivreg(pid7_2019 ~ pid7_2016, ~ pid7_2012, data = all_waves_PID7_VSG_vs_ISCAP))

###VSG VS. TAPS
##Creating data set [COMPLETE CASES ONLY]
#Filtering by columns
all_waves_PID7_VSG_vs_TAPS = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017')]

##Converting values to new scale and formatting as numeric
all_waves_PID7_VSG_vs_TAPS = convert_to_numeric_PID7(all_waves_PID7_VSG_vs_TAPS)

##Filtering by rows
all_waves_PID7_VSG_vs_TAPS = all_waves_PID7_VSG_vs_TAPS[rowSums(is.na(all_waves_PID7_VSG_vs_TAPS))==0,]

summary(rowSums(is.na(all_waves_PID7_VSG_vs_TAPS))) #Checking that the correct type of cases remain

##Computing statistics
#Correlations
cor_PID7_listwise = round(cor(all_waves_PID7_VSG_vs_TAPS),3)

lower.tri(cor_PID7_listwise, diag = FALSE)
upper.tri(cor_PID7_listwise, diag = FALSE)
lower_7_listwise<-cor_PID7_listwise
lower_7_listwise[lower.tri(cor_PID7_listwise, diag=TRUE)]=""
lower_7_listwise<-as.data.frame(lower_7_listwise)
lower_7_listwise
write.csv(lower_7_listwise, "./VSG_vs_TAPS_PID7_correlations.csv")

#Means and SDs (Individuals)
all_waves_PID7_VSG_vs_TAPS$Mean_PID7 = apply(all_waves_PID7_VSG_vs_TAPS[,1:4], 1, mean, na.rm = TRUE)
all_waves_PID7_VSG_vs_TAPS$SD_PID7 = apply(all_waves_PID7_VSG_vs_TAPS[,1:4], 1, sd, na.rm = TRUE)

summary(all_waves_PID7_VSG_vs_TAPS$Mean_PID7)
summary(all_waves_PID7_VSG_vs_TAPS$SD_PID7)

#Means and SDs (Waves)
as.data.frame(round(apply(all_waves_PID7_VSG_vs_TAPS, 2, mean, na.rm=TRUE),2))
as.data.frame(round(apply(all_waves_PID7_VSG_vs_TAPS, 2, sd, na.rm=TRUE),2))

##Time Series and IV Regressions
#OLS Regressions
summary(lm(pid7_2012 ~ pid7_2011, data = all_waves_PID7_VSG_vs_TAPS))
summary(lm(pid7_2016 ~ pid7_2012, data = all_waves_PID7_VSG_vs_TAPS))
summary(lm(pid7_2017 ~ pid7_2016, data = all_waves_PID7_VSG_vs_TAPS))

#IV Regressions
library('ivreg')
summary(ivreg(pid7_2016 ~ pid7_2012, ~ pid7_2011, data = all_waves_PID7_VSG_vs_TAPS))
summary(ivreg(pid7_2017 ~ pid7_2016, ~ pid7_2012, data = all_waves_PID7_VSG_vs_TAPS))

#Computing Implied Wiley-Wiley (1970) R^2s
wiley_r_squared(all_waves_PID7_VSG_vs_TAPS,2)



####Time Series and IV Regressions [INCLUDING WEIGHTS]
###Creating data set [COMPLETE CASES ONLY]
##Filtering by columns
all_waves_PID7 = vsg[, c('pid7_2011','pid7_2012','pid7_2016','pid7_2017','pid7_2018','pid7_2019')]

##Converting values to new scale and formatting as numeric
all_waves_PID7 = convert_to_numeric_PID7(all_waves_PID7)

##Creating a filter label
all_waves_PID7$filter = ifelse(rowSums(is.na(all_waves_PID7))==0,1,0)

##Appending a weight for complete case respondents (see 2019 dfvsg_Guide PDF)
weights = vsg[,c('weight_2016','weight_2017','weight_panel_2018','weight_2019')]

for(i in 1:nrow(weights)){
  for(j in 1:ncol(weights)){
    weights[i,j] = ifelse(is.na(weights[i,j]),1,weights[i,j])
  }
}

all_waves_PID7 = cbind(all_waves_PID7,weights)

##Filtering by rows
all_waves_PID7 = subset(all_waves_PID7, filter==1)

summary(rowSums(is.na(all_waves_PID7))) #Checking that the correct type of cases remain

##Creating multiple dataframes
complete_cases_11_12_16_PID7 = all_waves_PID7[, c('pid7_2011','pid7_2012','pid7_2016','weight_2016')]
complete_cases_12_16_17_PID7 = all_waves_PID7[, c('pid7_2012','pid7_2016','pid7_2017','weight_2017')]
complete_cases_16_17_18_PID7 = all_waves_PID7[, c('pid7_2016','pid7_2017','pid7_2018','weight_panel_2018')]
complete_cases_17_18_19_PID7 = all_waves_PID7[, c('pid7_2017','pid7_2018','pid7_2019','weight_2019')]

names(complete_cases_11_12_16_PID7)[1:4] <- c('PID_t_0','PID_t_1','PID_t_2','weight')
names(complete_cases_12_16_17_PID7)[1:4] <- c('PID_t_0','PID_t_1','PID_t_2','weight')
names(complete_cases_16_17_18_PID7)[1:4] <- c('PID_t_0','PID_t_1','PID_t_2','weight')
names(complete_cases_17_18_19_PID7)[1:4] <- c('PID_t_0','PID_t_1','PID_t_2','weight')

##Creating combined dataframes
complete_cases_rbind_PID7 = rbind(complete_cases_11_12_16_PID7,complete_cases_12_16_17_PID7,complete_cases_16_17_18_PID7,complete_cases_17_18_19_PID7)

##Adding a wave column
w1_PID7 = as.data.frame(rep('2011',nrow(complete_cases_11_12_16_PID7)))
names(w1_PID7)[1] <- 'wave'
w2_PID7 = as.data.frame(rep('2012',nrow(complete_cases_12_16_17_PID7)))
names(w2_PID7)[1] <- 'wave'
w3_PID7 = as.data.frame(rep('2016',nrow(complete_cases_16_17_18_PID7)))
names(w3_PID7)[1] <- 'wave'
w4_PID7 = as.data.frame(rep('2017',nrow(complete_cases_17_18_19_PID7)))
names(w4_PID7)[1] <- 'wave'

complete_cases_rbind_PID7_waves = rbind(w1_PID7,w2_PID7,w3_PID7,w4_PID7)
complete_cases_rbind_PID7 = cbind(complete_cases_rbind_PID7,complete_cases_rbind_PID7_waves)
summary(complete_cases_rbind_PID7)

##IV Regressions
library('ivreg')
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_rbind_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_rbind_PID7, weights = weight))

summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_11_12_16_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_11_12_16_PID7, weights = weight))

summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_12_16_17_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_12_16_17_PID7, weights = weight))

summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_16_17_18_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_16_17_18_PID7, weights = weight))

summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_17_18_19_PID7))
summary(ivreg(PID_t_2 ~ PID_t_1, ~ PID_t_0, data=complete_cases_17_18_19_PID7, weights = weight))